clear
set more off
set mem 1g
# delimit ;

/*****************************************************************************************************************;
**This dofile calculates the 'simulated' grid measures of arsenic contamination and generates hh_simulatedmindistance_YYYY;
pre-generated datasets this dofile uses:
hhXwelldata_uncontaminated_YYYY
hhXwelldata_contaminated_YYYY

*****************************************************************************************************************/;

*****************************************************************************************************************;
*Paths;
*****************************************************************************************************************;

*access files in subdirectory generated by data creating dofiles;
local indatasetU "dtafiles/hhXwelldata_uncontaminated";
local indatasetC "dtafiles/hhXwelldata_contaminated";
local outdataset "dtafiles/hh_simulatedmindistance";

*****************************************************************************************************************;
*Main
*****************************************************************************************************************;

*install necessary ado files;
ssc inst _gwtmean, replace;

foreach YYYY in 1999 2004 2007 {;

use "`indatasetU'_`YYYY'", clear;

*mathematically, only need to add wells within X+10.1 miles of the cluster center;
*where X is the distance from the cluster center to the closest uncontaminated well (call it well A);
*this way, point on other side of cluster is closer to the well A (distance X+5) than it is to;
*any potential wells just outside the radius (distance X+10.1-5);
egen mindistance=min(distance), by(dhsid);
drop if distance>max(15.1,mindistance+10.1);

for var lathhid-arsenic: rename X XU;

bysort dhsid: gen count=_n;
sum count;
local maxwellU=r(max);
drop distance mindistance;
reshape wide wellcountU latwellU longwellU arsenicU, i(dhsid) j(count);
order dhsid lathhid longhhid;

count;
local numU=r(N);

gen clusternum=_n;
sort clusternum;

tempfile U;
save `U', replace;

use "`indatasetC'_`YYYY'", clear;

*mathematically, only need to add wells within X+10.1 miles of the cluster center;
*where X is the distance from the cluster center to the closest uncontaminated well (call it well A);
*this way, point on other side of cluster is closer to the well A (distance X+5) than it is to;
*any potential wells just outside the radius (distance X+10.1-5);
egen mindistance=min(distance), by(dhsid);
drop if distance>max(15.1,mindistance+10.1);

for var lathhid-arsenic: rename X XC;

bysort dhsid: gen count=_n;
sum count;
local maxwellC=r(max);
drop distance mindistance;
reshape wide wellcountC latwellC longwellC arsenicC, i(dhsid) j(count);
order dhsid lathhid longhhid;

count;
local numC=r(N);

gen clusternum=_n;
sort clusternum;

tempfile C;
save `C', replace;

** code is written assuming numC = numU - need to fix if not true;
assert `numC'==`numU';


*** generate a grid of points;

clear;
local oneside=114;

set obs `oneside';
gen x=-1+2*(_n-1)/(sqrt(_N*_N)-1);
expand `oneside';
sort x;
gen y=-1+2*(mod(_n,sqrt(_N))+sqrt(_N)*(mod(_n,sqrt(_N))==0)-1)/(sqrt(_N)-1);

gen distance=sqrt((x-x[_n-1])^2+(y-y[_n-1])^2) if mod(_n,sqrt(_N))!=1;
gen distance2=sqrt((x-x[_n-sqrt(_N)])^2+(y-y[_n-sqrt(_N)])^2);

sum dis*;
drop dis*;
gen distance=sqrt(x^2+y^2);
drop if distance>1;
drop distance;

count;
local numgrid=r(N);
gen point=_n;
sort point;

expand `numU';

bysort point: gen clusternum=_n;

merge m:1 clusternum using `U';
assert _m==3; drop _m;

gen pointlatU=lathhidU+x*5/69.11;
gen pointlongU=longhhidU+y*5/(69.11*cos(lathhidU*_pi/180));

vincenty pointlat pointlong lathhid longhhid, h(distance_clusterU); 

forvalues i=1(1)`maxwellU' {;
	di "`i' of `maxwellU'";
	vincenty pointlat pointlong latwellU`i' longwellU`i', h(distance_wellU`i');
};

egen mindistanceU_grid=rmin(distance_wellU*);
gen invdistanceU=1/distance_cluster;

egen mean_mindistanceU=mean(mindistanceU_grid), by(dhsid);
egen wmean_mindistanceU=wtmean(mindistanceU_grid), weight(invdistance) by(dhsid);

gen mindistanceU_1=mindistanceU_grid<=1 if mindistanceU_grid!=.;
egen fraction_mindistanceU_1m=mean(mindistanceU_1), by(dhsid);
egen wfraction_mindistanceU_1m=wtmean(mindistanceU_1), weight(invdistance) by(dhsid);


** do the same with contaminated;
merge m:1 clusternum using `C';
assert _m==3; drop _m;

gen pointlatC=lathhidC+x*5/69.11;
gen pointlongC=longhhidC+y*5/(69.11*cos(lathhidC*_pi/180));

vincenty pointlatC pointlongC lathhidC longhhidC, h(distance_clusterC); 

forvalues i=1(1)`maxwellC' {;
	di "`i' of `maxwellC'";
	vincenty pointlatC pointlongC latwellC`i' longwellC`i', h(distance_wellC`i');
};

egen mindistanceC_grid=rmin(distance_wellC*);

gen mindistanceC_1=mindistanceC_grid<=1 if mindistanceC_grid!=.;
egen fraction_mindistanceC_1m=mean(mindistanceC_1), by(dhsid);
egen wfraction_mindistanceC_1m=wtmean(mindistanceC_1), weight(invdistance) by(dhsid);

** distance between point and cluster shouldn't depend on whether well is C or U;
assert distance_clusterU==distance_clusterC;

compress;

keep dhsid mean_* wmean_* fraction* wfraction*;
label variable mean_mindistanceU "Average Distance to Closest Uncontaminated Well";
label variable wmean_mindistanceU "Weighted Average Distance to Closest Uncontaminated Well";
label variable fraction_mindistanceU_1m "Fraction of Points within 1 m of Uncontaminated Well";
label variable wfraction_mindistanceU_1m "Weighted Fraction of Points within 1 m of Uncontaminated Well";

label variable fraction_mindistanceC_1m "Fraction of Points within 1 m of Contaminated Well";
label variable wfraction_mindistanceC_1m "Weighted Fraction of Points within 1 m of Contaminated Well";

duplicates drop;

sort dhsid;
save "`outdataset'_`YYYY'_grid", replace;

};

/*
(uniform()*2-1)*5/69.11;

1� of latitude = about 69.11 miles 

1� of longitude = about 69.11 miles along the equator. But all of the longitudes 
come together at the poles, so the farther you are from the equator, the fewer 
miles there are in one degree. 

Number of miles in 1� of longitude = (69.11) x (cosine of the latitude)

*/;

